library(knitr)
library(data.table)
library(plyr)
library(dplyr)
library(Rtsne)
library(pca3d)
library(scatterplot3d)
library(rgl)
library(kernlab)
library(caret)

For this assignment I will use data from the Kaggle competition “Give me some credit”.
This competition requires participants to improve on the state of the art in credit scoring, by predicting the probability that somebody will experience financial distress in the next two years.
I prepared this data in the assignment of preparacion de datos en R, see other html file.

# NOTE(review): setwd() with an absolute path hurts portability — consider
# relative paths or the here package. Kept because later chunks write output
# files (e.g. PNG snapshots) relative to this directory.
setwd("/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica")
# Load the pre-processed training data (centred/scaled in the earlier assignment)
mydata <- read.csv("Train.csv")
# Preview the first rows as a markdown table
mydata %>% head() %>% kable(row.names = F)
SD Age RUUL DebtRatio OpenCredit REloans Dep no3059 LateDays logIncome Debt Slack
1 -0.4939023 1.2448585 1.8402257 0.8836540 4.4647142 1.1416911 2.5165915 0.6509246 0.6966108 3.3016944 -0.8600639
0 -0.8323977 1.7805980 -0.8373704 -0.8652939 -0.9119619 0.2375055 -0.3518966 -0.3103085 -0.8503891 -1.0156682 -0.6902144
0 -0.9677959 0.9421161 -0.9818942 -1.2539490 -0.9119619 -0.6666802 1.0823475 1.6121577 -0.6568490 -1.0513858 -0.5155996
0 -1.5093885 -0.2480561 -1.1747757 -0.6709664 -0.9119619 -0.6666802 -0.3518966 -0.3103085 -0.5564974 -1.1376249 -0.3766131
0 -0.2231059 1.6406179 -1.2185069 -0.2823113 -0.0158492 -0.6666802 1.0823475 0.1703081 1.3519931 -0.9725529 3.7977272
0 1.4693711 -0.3059173 0.1601081 -1.0596214 -0.0158492 0.2375055 -0.3518966 -0.3103085 -0.4839642 -0.4008472 -0.7243528
# NOTE(review): this setwd() repeats the identical call made earlier — redundant
# but harmless.
setwd("/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica")
# The data dictionary uses ';' as field separator (European CSV convention)
DD <- read.csv("DataDictionary.csv", sep = ';')
DD %>% kable(row.names = FALSE)
Variable.Name Description Type
SD SeriousDlqin2yrs: person experienced 90 days past due delinquency or worse Y/N
Age Age of borrower in years integer
RUUL Revolving Utilization of Unsecured Lines: Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits percentage
DebtRatio Monthly debt payments, alimony, living costs divided by monthly gross income percentage
OpenCredit Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) integer
REloans Number of mortgage and real estate loans including home equity lines of credit integer
Dep Number of dependents in family excluding themselves (spouse, children etc.) integer
no3059 Number of times borrower has been 30-59 days past due but no worse in the last 2 years. integer
LateDays The sum of the number of days a person was late with payment. integer
logIncome Logarithm of monthly income real
Debt Monthly debt payments, alimony, living costs real
Slack What a person has left when he subtracts his debt from his income. real
# Column means: ~0 for every standardised predictor; SD is the 0/1 target, so
# its mean is the event rate. colMeans() is the idiomatic, faster equivalent of
# apply(x, 2, mean) and avoids the data.frame -> matrix coercion apply() does.
colMeans(mydata)
##            SD           Age          RUUL     DebtRatio    OpenCredit 
##  6.684000e-02 -2.155537e-17 -2.037136e-16 -1.772317e-16  4.476466e-17 
##       REloans           Dep        no3059      LateDays     logIncome 
## -1.825765e-16 -6.843217e-16 -7.003043e-16  1.935481e-16  8.727115e-17 
##          Debt         Slack 
## -3.314229e-17 -3.302383e-17
# Column variances: 1 for every standardised predictor. vapply() iterates over
# the data.frame columns directly (no matrix coercion, as apply() would do)
# and guarantees a numeric(1) result per column.
vapply(mydata, var, numeric(1))
##         SD        Age       RUUL  DebtRatio OpenCredit    REloans 
## 0.06237283 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 
##        Dep     no3059   LateDays  logIncome       Debt      Slack 
## 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000

This data has already been centered and scaled for the “preparacion de data” assignment.
Because the data is highly unbalanced, I will take a balanced sample with 20k datapoints.

# Split the dataset
# One data frame per target level: SD0 = no delinquency, SD1 = serious
# delinquency. The classes are highly unbalanced in the raw data.
mydata_split <- split(mydata, as.factor(mydata$SD))
SD0 <- mydata_split$'0'
SD1 <- mydata_split$'1'
# n = number of positive cases; the majority class is downsampled to match it
n <- nrow(SD1)
# Sample from SD0
set.seed(42)
SD0_sample10k <- sample_n(SD0, n)
# Merge into balanced new dataset
mydata_balanced <- rbind(SD0_sample10k, SD1)
# Shuffle the 2*n rows so the two classes are interleaved; note this rebinds
# mydata, so everything below works on the balanced sample
mydata <- mydata_balanced[sample(2*n),]
summary(mydata)
##        SD           Age               RUUL           DebtRatio      
##  Min.   :0.0   Min.   :-2.1187   Min.   :-0.9038   Min.   :-1.3165  
##  1st Qu.:0.0   1st Qu.:-0.9001   1st Qu.:-0.6989   1st Qu.:-0.6067  
##  Median :0.5   Median :-0.2231   Median : 0.3468   Median : 0.0000  
##  Mean   :0.5   Mean   :-0.2034   Mean   : 0.4975   Mean   : 0.1091  
##  3rd Qu.:1.0   3rd Qu.: 0.4539   3rd Qu.: 1.7440   3rd Qu.: 0.4619  
##  Max.   :1.0   Max.   : 3.2972   Max.   : 2.8884   Max.   : 6.1656  
##    OpenCredit          REloans              Dep          
##  Min.   :-1.64260   Min.   :-0.91196   Min.   :-0.66668  
##  1st Qu.:-0.86529   1st Qu.:-0.91196   1st Qu.:-0.66668  
##  Median :-0.28231   Median :-0.01585   Median :-0.66668  
##  Mean   :-0.05037   Mean   :-0.01527   Mean   : 0.08252  
##  3rd Qu.: 0.49500   3rd Qu.: 0.88026   3rd Qu.: 0.23750  
##  Max.   : 9.43407   Max.   :17.01029   Max.   : 8.37518  
##      no3059           LateDays         logIncome        
##  Min.   :-0.3519   Min.   :-0.3103   Min.   :-10.54352  
##  1st Qu.:-0.3519   1st Qu.:-0.3103   1st Qu.: -0.46865  
##  Median :-0.3519   Median :-0.3103   Median :  0.18046  
##  Mean   : 0.4717   Mean   : 0.6845   Mean   : -0.07508  
##  3rd Qu.: 1.0823   3rd Qu.: 1.1315   3rd Qu.:  0.33475  
##  Max.   :18.2933   Max.   :24.2011   Max.   :  1.35199  
##       Debt              Slack        
##  Min.   :-1.21093   Min.   :-4.5786  
##  1st Qu.:-0.76608   1st Qu.:-0.7813  
##  Median : 0.04979   Median :-0.1115  
##  Mean   : 0.03707   Mean   :-0.1466  
##  3rd Qu.: 0.35207   3rd Qu.: 0.1544  
##  Max.   :12.61682   Max.   : 3.9328

Because of sampling the data isn’t scaled anymore.

1. Linear methods: PCA

Without scaling:

set.seed(42)
# PCA on the predictors only (column 1 is the SD target). scale. = F keeps the
# columns' current variances, so higher-variance columns dominate the PCs.
pca_mydata <- prcomp(mydata[,-1], scale. = F)
# Map a label vector to plotting colours: first factor level -> "green",
# second -> "red". (The original had a console continuation prompt "+ "
# pasted into the source and used "=" for assignment.)
Cols <- function(vec) {
  cols <- c("green", "red")
  cols[as.numeric(as.factor(vec))]
}
# True class labels, used only for colouring (green = SD 0, red = SD 1)
labels <- mydata$SD
par(mfrow=c(1,1))
# Scatter of the first two principal components, coloured by class
plot(pca_mydata$x[,1:2], col=Cols(labels))
title(main = "PCA without scaling the datasample, 20k datapoints")

The red points are the people with a serious delinquency. The variance of these points (SD=1) is a lot bigger than the green points (SD=0).

# Biplot: observations drawn as dots plus variable-loading arrows on PC1/PC2
biplot(pca_mydata, xlabs=rep(".", nrow(mydata)))
title(main = "biplot of PCA without scaling the datasample, 20k datapoints")

LateDays, no3059 and RUUL are important variables in PC1.
Debt, REloans, DebtRatio and OpenCredit are important variables in PC2. We can see this in the rotation matrix as well:

# Loadings (rotation matrix): each column gives the variable weights of one PC
pca_mydata$rotation %>% kable(row.names = T)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
Age 0.0789108 -0.0962894 -0.2250895 -0.3616954 0.3120379 -0.0174741 0.7979091 -0.0773024 0.2481587 0.0077937 -0.0325819
RUUL -0.2602313 0.1187996 0.2649503 0.3731773 -0.2761405 0.5577852 0.3144218 -0.4430075 0.1557705 -0.0303642 0.0320215
DebtRatio -0.0790184 -0.4479517 0.4375161 -0.0111380 0.0844829 0.0035208 0.1810887 0.2115911 -0.3028204 -0.2141888 0.6158410
OpenCredit 0.0308925 -0.3994792 -0.1328818 -0.1748884 0.0409093 -0.1594973 -0.1762682 -0.8045933 -0.2891627 -0.0365625 -0.0266978
REloans 0.0149449 -0.5031886 -0.0511274 0.1158610 0.0694128 0.0412832 -0.2856727 0.0193859 0.7762022 -0.1904019 0.0573602
Dep -0.0535120 -0.0767410 -0.0462910 0.4751677 -0.3901127 -0.7119685 0.3132208 -0.0261826 0.0602562 -0.0318648 -0.0281789
no3059 -0.5441232 -0.1458542 -0.1779235 -0.5226752 -0.5941675 0.0360914 -0.0125418 0.1494060 0.0284371 0.0114374 -0.0073384
LateDays -0.7822205 0.0970139 -0.1147108 0.1873515 0.5487051 -0.1528488 -0.0713505 -0.0187313 -0.0219470 0.0098104 0.0007204
logIncome 0.0319078 -0.1742720 -0.5051379 0.2601027 -0.0257921 0.2749603 0.0793516 0.1882096 -0.3101064 -0.6212698 -0.2136899
Debt -0.0286952 -0.5396560 -0.0006506 0.2247357 0.0324220 0.1945535 0.0887371 0.2130386 -0.1877077 0.6264435 -0.3709634
Slack 0.0773771 0.0706979 -0.6044181 0.1831596 -0.0687971 0.1280452 -0.0384856 -0.0329381 0.0051310 0.3686470 0.6561435

How many principal components are sufficient in this case?

# Percent Variance Explained
# Each PC's variance is its squared singular value (sdev^2); normalise by the
# total to get the proportion of variance explained per component.
# (Assignment changed from "=" to the conventional "<-".)
pr.var <- pca_mydata$sdev^2
pve <- pr.var/sum(pr.var)
pve
##  [1] 0.32005368 0.20384070 0.11756412 0.08471263 0.07862810 0.05886057
##  [7] 0.04138278 0.03665455 0.03382128 0.01231714 0.01216444

PC1 explains 32% of the variance in the data and PC2 20%.

par(mfrow=c(1,2))
# Scree plot (left) and cumulative variance explained (right)
plot(pve, xlab="Principal Component", ylab="Percent Variance Explained", ylim=c(0,1), type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Accumulated Percent Variance Explained",
     ylim=c(0,1), type='b', col='blue')
# Base graphics are not ggplot: the reference line must be a separate call.
# The original `plot(...) + abline(...)` only "worked" by accident and printed
# the spurious `integer(0)` shown below.
abline(h=0.75, col='red')

## integer(0)

We can see in the figure that with 5 principal components we can describe more than 75% of the variance in the data, which is used as a rule of thumb in practice.

With scaling:

set.seed(42)
# Same PCA but with scale. = T: every predictor is rescaled to unit variance
# first, so no single high-variance column dominates the components
pca_mydata <- prcomp(mydata[,-1], scale. = T)
par(mfrow=c(1,1))
plot(pca_mydata$x[,1:2], col=Cols(labels))
title(main = "PCA after scaling the datasample, 20k datapoints")

We can see that scaling makes a big difference.

# Biplot of the scaled PCA: dots for observations, arrows for variable loadings
biplot(pca_mydata, xlabs=rep(".", nrow(mydata)))
title(main = "biplot of PCA after scaling the datasample")

In the scaled case the principal components consists of a different combination of the original variables, which we can also see in the rotation matrix:

# Loadings of the scaled PCA — compare with the unscaled rotation matrix above
pca_mydata$rotation %>% kable(row.names = T)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
Age 0.1338299 -0.2799918 0.1040284 0.5863831 0.2563284 0.6616306 -0.1727690 0.1117026 -0.0498581 0.0170975 0.0215189
RUUL -0.1261597 0.4210614 -0.2631900 -0.1209986 -0.3989183 0.2999257 -0.6543846 0.2040655 -0.0377837 0.0131549 -0.0482449
DebtRatio 0.4182097 0.3320304 0.2504661 -0.0328654 -0.0819929 0.1929248 0.1071547 -0.3473466 0.0117690 -0.3056489 -0.6175830
OpenCredit 0.4340212 -0.0857793 0.0311074 0.1453512 0.2596008 -0.4756121 -0.6010691 -0.0908545 0.3405589 0.0550962 -0.0043384
REloans 0.4909171 -0.0079647 -0.0016664 -0.0747607 -0.0834084 -0.0959248 0.1922375 0.7883077 -0.1445887 0.1602319 -0.1795049
Dep 0.0823588 0.0850987 -0.2545554 -0.5854808 0.6977889 0.2962234 -0.0381815 0.0020099 -0.0031665 0.0474922 -0.0019825
no3059 0.0757403 0.3543293 -0.3957709 0.3869452 0.2362349 -0.2554952 0.0335292 -0.1353367 -0.6491211 -0.0043596 0.0212324
LateDays -0.0417371 0.3925937 -0.4123410 0.3224337 0.0833002 0.0376116 0.3313530 0.1198954 0.6611620 -0.0189046 0.0111243
logIncome 0.2389017 -0.3379409 -0.4755298 -0.0535116 -0.2891626 0.1175131 0.0828085 -0.3544060 0.0175962 0.5725837 -0.2041637
Debt 0.5372637 0.0662357 -0.0713837 -0.1163259 -0.2377515 0.1629084 0.1017526 -0.1622569 -0.0137154 -0.2728545 0.7028156
Slack -0.0437274 -0.4702489 -0.4832422 -0.0290669 -0.0655098 -0.0697987 -0.0512108 0.1045021 -0.0077873 -0.6873645 -0.2175976

This time LateDays, no3059 and RUUL are mapped into PC2.

How many principal components are sufficient in this case?

# Percent Variance Explained
# Same computation as for the unscaled PCA: per-component variance (sdev^2)
# over total variance. (Assignment changed from "=" to the conventional "<-".)
pr.var <- pca_mydata$sdev^2
pve <- pr.var/sum(pr.var)
pve
##  [1] 0.24802228 0.20157463 0.15011676 0.10426470 0.07586700 0.05980580
##  [7] 0.04673449 0.04179853 0.03818827 0.01864291 0.01498462

This time PC1 explains 25% of the variance in the data and PC2 20%.

par(mfrow=c(1,2))
# Scree plot (left) and cumulative variance explained (right)
plot(pve, xlab="Principal Component", ylab="Percent Variance Explained", ylim=c(0,1), type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Accumulated Percent Variance Explained",
     ylim=c(0,1), type='b', col='blue')
# abline() must be a separate statement in base graphics; `plot(...) +
# abline(...)` produced the spurious `integer(0)` shown below.
abline(h=0.75, col='red')

## integer(0)

Again we see that we need 5 principal components to describe more than 75% of the variance.

PCA 3D

# Class labels as a factor for pca3d grouping
gr <- factor(mydata[,1])
# plyr::mapvalues recodes the 0/1 levels to plot colours
# (assignment changed from "=" to the conventional "<-")
colors <- mapvalues(gr, from = c(0, 1), to = c("green", "red"))
summary(gr)
##     0     1 
## 10026 10026
# Interactive 3D scatter of the first three PCs (opens an rgl device)
pca3d(pca_mydata, group = gr, col = colors)
## [1] 0.2727411 0.1550222 0.1535694
## Creating new device
# Save a PNG snapshot of the current rgl scene (written to the working dir)
snapshotPCA3d(file="PCA3D_20k.png")
Screenshot of output PCA3D with 20.000 points
Screenshot of output PCA3D with 20.000 points

Smaller sample

I now take a smaller sample, which is important for using t-SNE later on.
For this sample I take 500 datapoints with SD=0 and 500 datapoints with SD=1.

set.seed(42)
# 500 random rows from each class -> balanced 1k sample; the 20k sample is too
# large for t-SNE to run in reasonable time
SD0_sample_500 <- sample_n(SD0, 500)
SD1_sample_500 <- sample_n(SD1, 500)
mydata_balanced_1k <- rbind(SD0_sample_500, SD1_sample_500)
# Shuffle so the classes are interleaved; rebinds mydata for the chunks below
mydata <- mydata_balanced_1k[sample(1000),]
summary(mydata)
##        SD           Age               RUUL           DebtRatio      
##  Min.   :0.0   Min.   :-2.0510   Min.   :-0.9038   Min.   :-1.3165  
##  1st Qu.:0.0   1st Qu.:-0.9001   1st Qu.:-0.7070   1st Qu.:-0.5593  
##  Median :0.5   Median :-0.2231   Median : 0.3354   Median : 0.0000  
##  Mean   :0.5   Mean   :-0.1811   Mean   : 0.4848   Mean   : 0.1580  
##  3rd Qu.:1.0   3rd Qu.: 0.3862   3rd Qu.: 1.6450   3rd Qu.: 0.5456  
##  Max.   :1.0   Max.   : 2.8911   Max.   : 2.8884   Max.   : 6.0776  
##    OpenCredit          REloans              Dep              no3059       
##  Min.   :-1.64260   Min.   :-0.91196   Min.   :-0.6667   Min.   :-0.3519  
##  1st Qu.:-0.67097   1st Qu.:-0.91196   1st Qu.:-0.6667   1st Qu.:-0.3519  
##  Median :-0.08798   Median :-0.01585   Median :-0.6667   Median :-0.3519  
##  Mean   : 0.01015   Mean   : 0.01193   Mean   : 0.1453   Mean   : 0.5474  
##  3rd Qu.: 0.49500   3rd Qu.: 0.88026   3rd Qu.: 1.1417   3rd Qu.: 1.0823  
##  Max.   : 7.87944   Max.   : 9.84139   Max.   : 6.5668   Max.   :11.1221  
##     LateDays         logIncome              Debt        
##  Min.   :-0.3103   Min.   :-10.54352   Min.   :-1.2109  
##  1st Qu.:-0.3103   1st Qu.: -0.40410   1st Qu.:-0.7249  
##  Median :-0.3103   Median :  0.21230   Median : 0.0596  
##  Mean   : 0.7860   Mean   : -0.02264   Mean   : 0.1020  
##  3rd Qu.: 1.1315   3rd Qu.:  0.40187   3rd Qu.: 0.4679  
##  Max.   :15.0694   Max.   :  1.35199   Max.   : 7.5926  
##      Slack         
##  Min.   :-2.68051  
##  1st Qu.:-0.75851  
##  Median :-0.08469  
##  Mean   :-0.10551  
##  3rd Qu.: 0.25036  
##  Max.   : 3.93283
set.seed(42)
# Scaled PCA on the 1k balanced sample
pca_mydata <- prcomp(mydata[,-1], scale. = T)
gr <- factor(mydata[,1])
# (assignment changed from "=" to the conventional "<-")
colors <- mapvalues(gr, from = c(0, 1), to = c("green", "red"))
summary(gr)
##   0   1 
## 500 500
# 3D PC scatter with 75%-CI ellipses per group; plane hidden for clarity
pca3d(pca_mydata, group = gr, show.ellipses=TRUE, ellipse.ci=0.75, show.plane=F, col = colors)
## [1] 0.1760447 0.1136925 0.1375280
snapshotPCA3d(file="PCA3D_1k.png")
Screenshot of output PCA3D with 1000 points, ellipse showing 75% confidence interval
Screenshot of output PCA3D with 1000 points, ellipse showing 75% confidence interval

2. Non-Linear methods: t-SNE

The TSNE algorithm can be used to map high dimensional data sets onto a 2 or 3 dimensional graph.
I’ll use the reduced dataset, because when the number of observations is too large (more than 10,000), the algorithm takes a lot of time and the visualization doesn’t work.
A feature of t-SNE is a tuneable parameter “perplexity”, which says (loosely) how to balance attention between local and global aspects of your data. The parameter is, in a sense, a guess about the number of close neighbors each point has. I will vary the perplexity parameter, to see what effect it has on my data.

# unique() guards against duplicate rows, which Rtsne rejects by default.
# NOTE(review): if unique() ever dropped rows, the colour vector
# Cols(mydata$SD) used at plotting would no longer align — here the verbose
# output confirms all 1000 rows survived.
mydata_matrix <- as.matrix(unique(mydata[,-1]))
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 2, perplexity=30, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 1000
## Done in 0.10 seconds (sparsity = 0.128138)!
## Learning embedding...
## Iteration 50: error is 69.757288 (50 iterations in 0.46 seconds)
## Iteration 100: error is 67.910138 (50 iterations in 0.45 seconds)
## Iteration 150: error is 67.901686 (50 iterations in 0.41 seconds)
## Iteration 200: error is 67.901316 (50 iterations in 0.42 seconds)
## Iteration 250: error is 67.901649 (50 iterations in 0.42 seconds)
## Iteration 300: error is 1.462661 (50 iterations in 0.41 seconds)
## Iteration 350: error is 1.288151 (50 iterations in 0.42 seconds)
## Iteration 400: error is 1.242164 (50 iterations in 0.42 seconds)
## Iteration 450: error is 1.220981 (50 iterations in 0.42 seconds)
## Iteration 500: error is 1.211245 (50 iterations in 0.42 seconds)
## Fitting performed in 4.25 seconds.
# Persist this (stochastic) embedding so it can be reloaded without rerunning
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per30.RData")
# 2D embedding coloured by the true class
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=30")

# Same run with perplexity 50 — attends to a larger neighbourhood per point
mydata_matrix <- as.matrix(unique(mydata[,-1]))
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 2, perplexity=50, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 1000
## Done in 0.17 seconds (sparsity = 0.213184)!
## Learning embedding...
## Iteration 50: error is 63.671460 (50 iterations in 0.51 seconds)
## Iteration 100: error is 63.154005 (50 iterations in 0.53 seconds)
## Iteration 150: error is 62.757511 (50 iterations in 0.46 seconds)
## Iteration 200: error is 62.757626 (50 iterations in 0.46 seconds)
## Iteration 250: error is 62.757590 (50 iterations in 0.46 seconds)
## Iteration 300: error is 1.274432 (50 iterations in 0.45 seconds)
## Iteration 350: error is 1.146637 (50 iterations in 0.45 seconds)
## Iteration 400: error is 1.109491 (50 iterations in 0.45 seconds)
## Iteration 450: error is 1.097372 (50 iterations in 0.45 seconds)
## Iteration 500: error is 1.092265 (50 iterations in 0.44 seconds)
## Fitting performed in 4.66 seconds.
# Persist the perplexity-50 embedding under its own file name
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per50.RData")
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=50")

# Same run with perplexity 100 — the most "global" setting tried here
mydata_matrix <- as.matrix(unique(mydata[,-1]))
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 2, perplexity=100, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 2, perplexity = 100.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 1000
## Done in 0.36 seconds (sparsity = 0.418520)!
## Learning embedding...
## Iteration 50: error is 55.392553 (50 iterations in 0.61 seconds)
## Iteration 100: error is 55.392553 (50 iterations in 0.58 seconds)
## Iteration 150: error is 55.392553 (50 iterations in 0.57 seconds)
## Iteration 200: error is 55.392553 (50 iterations in 0.60 seconds)
## Iteration 250: error is 55.392553 (50 iterations in 0.62 seconds)
## Iteration 300: error is 0.978117 (50 iterations in 0.62 seconds)
## Iteration 350: error is 0.859854 (50 iterations in 0.55 seconds)
## Iteration 400: error is 0.842660 (50 iterations in 0.53 seconds)
## Iteration 450: error is 0.836733 (50 iterations in 0.54 seconds)
## Iteration 500: error is 0.833816 (50 iterations in 0.54 seconds)
## Fitting performed in 5.77 seconds.
# Persist the perplexity-100 (2D) embedding
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per100.RData")
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=100")

When I set perplexity=100 it gives the best separation of the red and green points.

# Repeat the perplexity-100 run, now embedding into 3 dimensions
mydata_matrix <- as.matrix(unique(mydata[,-1]))
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 3, perplexity=100, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 3, perplexity = 100.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 1000
## Done in 0.36 seconds (sparsity = 0.418520)!
## Learning embedding...
## Iteration 50: error is 55.392553 (50 iterations in 0.94 seconds)
## Iteration 100: error is 55.392553 (50 iterations in 0.85 seconds)
## Iteration 150: error is 55.392553 (50 iterations in 0.87 seconds)
## Iteration 200: error is 55.392553 (50 iterations in 0.91 seconds)
## Iteration 250: error is 55.392553 (50 iterations in 0.96 seconds)
## Iteration 300: error is 0.816494 (50 iterations in 0.94 seconds)
## Iteration 350: error is 0.703511 (50 iterations in 0.87 seconds)
## Iteration 400: error is 0.683349 (50 iterations in 0.91 seconds)
## Iteration 450: error is 0.672979 (50 iterations in 0.88 seconds)
## Iteration 500: error is 0.668926 (50 iterations in 0.88 seconds)
## Fitting performed in 9.01 seconds.
# BUG FIX: this 3-D embedding was saved to "tsne_out_per100.RData", silently
# overwriting the 2-D perplexity-100 result saved just above. Use a distinct
# file name for the 3-D run.
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per100_3d.RData")
# plot() on a 3-column Y shows only the first two embedding dimensions
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=100, dims=3")

# Static 3D scatter of the three t-SNE dimensions
scatterplot3d(x=tsne_out$Y[,1],y=tsne_out$Y[,2],z=tsne_out$Y[,3], color = Cols(mydata$SD))
title(main = "t-SNE, perplexity=100, dims=3")

# Interactive rgl version: points rendered as spheres of radius 0.5
plot3d(x=tsne_out$Y[,1],y=tsne_out$Y[,2],z=tsne_out$Y[,3],
       col = Cols(mydata$SD),
       type="s",radius=0.5)
Screenshot of output t-SNE with 1000 points
Screenshot of output t-SNE with 1000 points

Conclusions

This visualization with t-SNE shows that it should be reasonably possible to separate the data in a higher dimension (in my case 11 dimensions), although a fraction of the points will be misclassified.

From this analysis I would say that it seems like a good idea to keep the 11 variables that I obtained after data preparacion and try to use a nonlinear algorithm (like a neural network) to predict the probability that somebody will experience financial distress in the next two years. I have a lot of data to train the network on. Although it would be interesting to compare with the results of linear classification, either with PCA or keeping only the most ‘important’ variables. Because PCA suggests (looking at the 3D graph of 20.000 datapoints) that up to a certain accuracy the data are linearly separable.

Extra: KernelPCA

The idea of kernelPCA is to first transform the data to another space using a kernel and do PCA in that space.
I will use a Radial Basis Kernel, and look at different values of sigma.

# Kernel PCA (kernlab) on the predictors with an RBF kernel; a very small
# sigma makes the kernel nearly linear
kpc <- kpca(~., data = mydata[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 2)
# rotated() gives the projection of the data onto the kernel principal components
plot(rotated(kpc), col=Cols(mydata$SD))
title(main = "KernelPCA (without scaling the datasample), Radial Basis kernel, sigma=0.0001")

# Larger sigma -> narrower RBF bandwidth -> a more local, more non-linear map
kpc <- kpca(~., data = mydata[,-1], kernel = "rbfdot", kpar = list(sigma = 0.01), features = 2)
plot(rotated(kpc), col=Cols(mydata$SD))
title(main = "KernelPCA (without scaling the datasample), Radial Basis kernel, sigma=0.01")

# Same kernel PCA with 3 features for an interactive 3D view
kpc <- kpca(~., data = mydata[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 3)
plot3d(x=rotated(kpc)[,1],y=rotated(kpc)[,2],z=rotated(kpc)[,3],
       col = Cols(mydata$SD),
       type="s",radius=0.1)
KernelPCA (without scaling the datasample), Radial Basis kernel, sigma=0.0001
KernelPCA (without scaling the datasample), Radial Basis kernel, sigma=0.0001

I thought that scaling has to occur after applying the kernel, but I’m not sure if kpca does this automatically. Let’s see what happens when we scale the data first.

# Re-centre and re-scale the predictors: the balanced subsampling shifted the
# original standardisation (see summary above). caret::preProcess learns the
# transform here; predict() applies it.
# (Assignments changed from "=" to the conventional "<-".)
trans <- preProcess(mydata[,-1], 
                    method=c("center", "scale"))
# NOTE(review): trans was fitted on mydata[,-1] but is applied to the full
# mydata — confirm predict() passes the unseen SD column through unchanged and
# returns all 12 columns, otherwise the [,-1] below drops the wrong column.
mydata_norm <- predict(trans, mydata)
kpc <- kpca(~., data = mydata_norm[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 2)
plot(rotated(kpc), col=Cols(mydata$SD))
title(main = "KernelPCA (after scaling datasample), Radial Basis kernel, sigma=0.0001")

# Kernel PCA on the rescaled data, 3 features, shown in an interactive 3D view
kpc <- kpca(~., data = mydata_norm[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 3)
plot3d(x=rotated(kpc)[,1],y=rotated(kpc)[,2],z=rotated(kpc)[,3],
       col = Cols(mydata$SD),
       type="s",radius=0.1)
KernelPCA (after scaling datasample), Radial Basis kernel, sigma=0.0001
KernelPCA (after scaling datasample), Radial Basis kernel, sigma=0.0001

I’m not sure if scaling before kpca makes things better…